Beer and Breweries Exploration

Introduction: This is an exploratory data analysis for the data set between Beers.csv and Breweries.csv for CEO and CFO of Budweiser

Loading the datasets

Doing intial inspection of data through various methods below.

# dataset should be in the same folder of this RMD file
Beers = read.csv("/Users/mingyang/Desktop/SMU/DoingDS_Fall2020/MSDS6306-Case-Study1/Beers.csv",header = TRUE) #loading beers dataset
Breweries = read.csv("/Users/mingyang/Desktop/SMU/DoingDS_Fall2020/MSDS6306-Case-Study1/Breweries.csv",header = TRUE) #loading breweries dataset
#below this line is for self analyzation
#summary(Beers)
#str(Beers)
##Beers$IBU
#summary(Breweries)
#str(Breweries)
#Above this line is for self analyzing can be deleted later
#Turn Breweries State column into a factor
Breweries$State = as.factor(Breweries$State)

How many breweries are present in each state?

  • As we can see by the plot below, each state’s breweries count is displayed in a bar chart
  • To make it easier to see which state has the most breweries, we assorted the breweries count by descending order
library(tidyverse) 
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
num_Breweries_by_state = Breweries %>% group_by(State) %>%
  summarise(count=n())
## `summarise()` ungrouping output (override with `.groups` argument)
num_Breweries_by_state
## # A tibble: 51 x 2
##    State count
##    <fct> <int>
##  1 " AK"     7
##  2 " AL"     3
##  3 " AR"     2
##  4 " AZ"    11
##  5 " CA"    39
##  6 " CO"    47
##  7 " CT"     8
##  8 " DC"     1
##  9 " DE"     2
## 10 " FL"    15
## # … with 41 more rows
#As we can see the number of breweries per state is in the list below, to see this better we will use a plot to show results
ggplot(data=num_Breweries_by_state)+
  geom_bar(mapping=aes(x=State,y=count,fill=State),stat="identity") +
  ggtitle("Breweries count by state")+xlab("State")+ylab("Count of Breweries")

#To see this by descend of count
ggplot(data=num_Breweries_by_state)+
  geom_bar(mapping=aes(x=reorder(State,-count),y=count,fill=State),stat="identity") +
  ggtitle("Breweries count by state")+xlab("State")+ylab("Count of Breweries")

Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.

Beers = Beers %>% rename(Brew_ID= Brewery_id)
Beers.with.Breweries = left_join(Beers,Breweries, by = "Brew_ID")
Beers.with.Breweries = Beers.with.Breweries %>% rename(Beer_Name= Name.x)
Beers.with.Breweries = Beers.with.Breweries %>% rename(Brew_Name= Name.y)
head(Beers.with.Breweries,6)
##             Beer_Name Beer_ID   ABV IBU Brew_ID                          Style
## 1            Pub Beer    1436 0.050  NA     409            American Pale Lager
## 2         Devil's Cup    2265 0.066  NA     178        American Pale Ale (APA)
## 3 Rise of the Phoenix    2264 0.071  NA     178                   American IPA
## 4            Sinister    2263 0.090  NA     178 American Double / Imperial IPA
## 5       Sex and Candy    2262 0.075  NA     178                   American IPA
## 6        Black Exodus    2261 0.077  NA     178                  Oatmeal Stout
##   Ounces                 Brew_Name City State
## 1     12 10 Barrel Brewing Company Bend    OR
## 2     12       18th Street Brewery Gary    IN
## 3     12       18th Street Brewery Gary    IN
## 4     12       18th Street Brewery Gary    IN
## 5     12       18th Street Brewery Gary    IN
## 6     12       18th Street Brewery Gary    IN

Address the missing values in each column

  • There are 62 missing values on ABV column, and there are 1005 missing values on IBU column
  • Since 1005/2410 is too big of a percentage of missing value for column IBU, I decided to use Predictive mean matching method(widely used statistical imputation method for missing values, first proposed by Donald B. Rubin in 1986 and R. J. A. Little in 1988) to fill the missing data.
  • After generating and filling in the missing data, through density plot below, we can see the generated filled in value matches the original density plot. Thus minimize any bias from the researcher of arbitrarily filling in value or overeasily using the overall mean for all missing value
summary(Beers.with.Breweries)
##   Beer_Name            Beer_ID            ABV               IBU        
##  Length:2410        Min.   :   1.0   Min.   :0.00100   Min.   :  4.00  
##  Class :character   1st Qu.: 808.2   1st Qu.:0.05000   1st Qu.: 21.00  
##  Mode  :character   Median :1453.5   Median :0.05600   Median : 35.00  
##                     Mean   :1431.1   Mean   :0.05977   Mean   : 42.71  
##                     3rd Qu.:2075.8   3rd Qu.:0.06700   3rd Qu.: 64.00  
##                     Max.   :2692.0   Max.   :0.12800   Max.   :138.00  
##                                      NA's   :62        NA's   :1005    
##     Brew_ID         Style               Ounces       Brew_Name        
##  Min.   :  1.0   Length:2410        Min.   : 8.40   Length:2410       
##  1st Qu.: 94.0   Class :character   1st Qu.:12.00   Class :character  
##  Median :206.0   Mode  :character   Median :12.00   Mode  :character  
##  Mean   :232.7                      Mean   :13.59                     
##  3rd Qu.:367.0                      3rd Qu.:16.00                     
##  Max.   :558.0                      Max.   :32.00                     
##                                                                       
##      City               State     
##  Length:2410         CO    : 265  
##  Class :character    CA    : 183  
##  Mode  :character    MI    : 162  
##                      IN    : 139  
##                      TX    : 130  
##                      OR    : 125  
##                     (Other):1406
library(mice) #Load mice library to analyze the pattern of missing data
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
md.pattern(Beers.with.Breweries)

##      Beer_Name Beer_ID Brew_ID Style Ounces Brew_Name City State ABV  IBU     
## 1405         1       1       1     1      1         1    1     1   1    1    0
## 943          1       1       1     1      1         1    1     1   1    0    1
## 62           1       1       1     1      1         1    1     1   0    0    2
##              0       0       0     0      0         0    0     0  62 1005 1067
# Since there is large amont of data missing in IBM column 
#Try to impute the missing data with Predictive mean Matching method
tempData <- mice(Beers.with.Breweries,m=5,maxit=50,meth='pmm',seed=20)
## 
##  iter imp variable
##   1   1  ABV  IBU
##   1   2  ABV  IBU
##   1   3  ABV  IBU
##   1   4  ABV  IBU
##   1   5  ABV  IBU
##   2   1  ABV  IBU
##   2   2  ABV  IBU
##   2   3  ABV  IBU
##   2   4  ABV  IBU
##   2   5  ABV  IBU
##   3   1  ABV  IBU
##   3   2  ABV  IBU
##   3   3  ABV  IBU
##   3   4  ABV  IBU
##   3   5  ABV  IBU
##   4   1  ABV  IBU
##   4   2  ABV  IBU
##   4   3  ABV  IBU
##   4   4  ABV  IBU
##   4   5  ABV  IBU
##   5   1  ABV  IBU
##   5   2  ABV  IBU
##   5   3  ABV  IBU
##   5   4  ABV  IBU
##   5   5  ABV  IBU
##   6   1  ABV  IBU
##   6   2  ABV  IBU
##   6   3  ABV  IBU
##   6   4  ABV  IBU
##   6   5  ABV  IBU
##   7   1  ABV  IBU
##   7   2  ABV  IBU
##   7   3  ABV  IBU
##   7   4  ABV  IBU
##   7   5  ABV  IBU
##   8   1  ABV  IBU
##   8   2  ABV  IBU
##   8   3  ABV  IBU
##   8   4  ABV  IBU
##   8   5  ABV  IBU
##   9   1  ABV  IBU
##   9   2  ABV  IBU
##   9   3  ABV  IBU
##   9   4  ABV  IBU
##   9   5  ABV  IBU
##   10   1  ABV  IBU
##   10   2  ABV  IBU
##   10   3  ABV  IBU
##   10   4  ABV  IBU
##   10   5  ABV  IBU
##   11   1  ABV  IBU
##   11   2  ABV  IBU
##   11   3  ABV  IBU
##   11   4  ABV  IBU
##   11   5  ABV  IBU
##   12   1  ABV  IBU
##   12   2  ABV  IBU
##   12   3  ABV  IBU
##   12   4  ABV  IBU
##   12   5  ABV  IBU
##   13   1  ABV  IBU
##   13   2  ABV  IBU
##   13   3  ABV  IBU
##   13   4  ABV  IBU
##   13   5  ABV  IBU
##   14   1  ABV  IBU
##   14   2  ABV  IBU
##   14   3  ABV  IBU
##   14   4  ABV  IBU
##   14   5  ABV  IBU
##   15   1  ABV  IBU
##   15   2  ABV  IBU
##   15   3  ABV  IBU
##   15   4  ABV  IBU
##   15   5  ABV  IBU
##   16   1  ABV  IBU
##   16   2  ABV  IBU
##   16   3  ABV  IBU
##   16   4  ABV  IBU
##   16   5  ABV  IBU
##   17   1  ABV  IBU
##   17   2  ABV  IBU
##   17   3  ABV  IBU
##   17   4  ABV  IBU
##   17   5  ABV  IBU
##   18   1  ABV  IBU
##   18   2  ABV  IBU
##   18   3  ABV  IBU
##   18   4  ABV  IBU
##   18   5  ABV  IBU
##   19   1  ABV  IBU
##   19   2  ABV  IBU
##   19   3  ABV  IBU
##   19   4  ABV  IBU
##   19   5  ABV  IBU
##   20   1  ABV  IBU
##   20   2  ABV  IBU
##   20   3  ABV  IBU
##   20   4  ABV  IBU
##   20   5  ABV  IBU
##   21   1  ABV  IBU
##   21   2  ABV  IBU
##   21   3  ABV  IBU
##   21   4  ABV  IBU
##   21   5  ABV  IBU
##   22   1  ABV  IBU
##   22   2  ABV  IBU
##   22   3  ABV  IBU
##   22   4  ABV  IBU
##   22   5  ABV  IBU
##   23   1  ABV  IBU
##   23   2  ABV  IBU
##   23   3  ABV  IBU
##   23   4  ABV  IBU
##   23   5  ABV  IBU
##   24   1  ABV  IBU
##   24   2  ABV  IBU
##   24   3  ABV  IBU
##   24   4  ABV  IBU
##   24   5  ABV  IBU
##   25   1  ABV  IBU
##   25   2  ABV  IBU
##   25   3  ABV  IBU
##   25   4  ABV  IBU
##   25   5  ABV  IBU
##   26   1  ABV  IBU
##   26   2  ABV  IBU
##   26   3  ABV  IBU
##   26   4  ABV  IBU
##   26   5  ABV  IBU
##   27   1  ABV  IBU
##   27   2  ABV  IBU
##   27   3  ABV  IBU
##   27   4  ABV  IBU
##   27   5  ABV  IBU
##   28   1  ABV  IBU
##   28   2  ABV  IBU
##   28   3  ABV  IBU
##   28   4  ABV  IBU
##   28   5  ABV  IBU
##   29   1  ABV  IBU
##   29   2  ABV  IBU
##   29   3  ABV  IBU
##   29   4  ABV  IBU
##   29   5  ABV  IBU
##   30   1  ABV  IBU
##   30   2  ABV  IBU
##   30   3  ABV  IBU
##   30   4  ABV  IBU
##   30   5  ABV  IBU
##   31   1  ABV  IBU
##   31   2  ABV  IBU
##   31   3  ABV  IBU
##   31   4  ABV  IBU
##   31   5  ABV  IBU
##   32   1  ABV  IBU
##   32   2  ABV  IBU
##   32   3  ABV  IBU
##   32   4  ABV  IBU
##   32   5  ABV  IBU
##   33   1  ABV  IBU
##   33   2  ABV  IBU
##   33   3  ABV  IBU
##   33   4  ABV  IBU
##   33   5  ABV  IBU
##   34   1  ABV  IBU
##   34   2  ABV  IBU
##   34   3  ABV  IBU
##   34   4  ABV  IBU
##   34   5  ABV  IBU
##   35   1  ABV  IBU
##   35   2  ABV  IBU
##   35   3  ABV  IBU
##   35   4  ABV  IBU
##   35   5  ABV  IBU
##   36   1  ABV  IBU
##   36   2  ABV  IBU
##   36   3  ABV  IBU
##   36   4  ABV  IBU
##   36   5  ABV  IBU
##   37   1  ABV  IBU
##   37   2  ABV  IBU
##   37   3  ABV  IBU
##   37   4  ABV  IBU
##   37   5  ABV  IBU
##   38   1  ABV  IBU
##   38   2  ABV  IBU
##   38   3  ABV  IBU
##   38   4  ABV  IBU
##   38   5  ABV  IBU
##   39   1  ABV  IBU
##   39   2  ABV  IBU
##   39   3  ABV  IBU
##   39   4  ABV  IBU
##   39   5  ABV  IBU
##   40   1  ABV  IBU
##   40   2  ABV  IBU
##   40   3  ABV  IBU
##   40   4  ABV  IBU
##   40   5  ABV  IBU
##   41   1  ABV  IBU
##   41   2  ABV  IBU
##   41   3  ABV  IBU
##   41   4  ABV  IBU
##   41   5  ABV  IBU
##   42   1  ABV  IBU
##   42   2  ABV  IBU
##   42   3  ABV  IBU
##   42   4  ABV  IBU
##   42   5  ABV  IBU
##   43   1  ABV  IBU
##   43   2  ABV  IBU
##   43   3  ABV  IBU
##   43   4  ABV  IBU
##   43   5  ABV  IBU
##   44   1  ABV  IBU
##   44   2  ABV  IBU
##   44   3  ABV  IBU
##   44   4  ABV  IBU
##   44   5  ABV  IBU
##   45   1  ABV  IBU
##   45   2  ABV  IBU
##   45   3  ABV  IBU
##   45   4  ABV  IBU
##   45   5  ABV  IBU
##   46   1  ABV  IBU
##   46   2  ABV  IBU
##   46   3  ABV  IBU
##   46   4  ABV  IBU
##   46   5  ABV  IBU
##   47   1  ABV  IBU
##   47   2  ABV  IBU
##   47   3  ABV  IBU
##   47   4  ABV  IBU
##   47   5  ABV  IBU
##   48   1  ABV  IBU
##   48   2  ABV  IBU
##   48   3  ABV  IBU
##   48   4  ABV  IBU
##   48   5  ABV  IBU
##   49   1  ABV  IBU
##   49   2  ABV  IBU
##   49   3  ABV  IBU
##   49   4  ABV  IBU
##   49   5  ABV  IBU
##   50   1  ABV  IBU
##   50   2  ABV  IBU
##   50   3  ABV  IBU
##   50   4  ABV  IBU
##   50   5  ABV  IBU
## Warning: Number of logged events: 254
#summary(tempData)
# completed dataset after adding in generated predictive values
completedData <- complete(tempData,1)
#head(completedData)
# Density plot original vs imputed dataset
densityplot(tempData)

#Note: idea used above to impute data is from link below:
#https://datascienceplus.com/imputing-missing-data-with-r-mice-package/

Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare

  • First plotted Median alcohol content by state, one with modified data set, and the other one with non-modified data set
  • Then plotted Median international bitterness unit for each state, both on the modified data set and non-modified
  • We can see between the modified data and non-modified data provides very different answers. If possible, going back to try to complete the missing data will make the graphics more accurate. But this is a random estimate of the actual value
#Compute and display Median of ABV and IBU by state:
median = completedData %>% group_by(State) %>%
  summarize(median_ABV=median(ABV),median_IBU=median(IBU))
## `summarise()` ungrouping output (override with `.groups` argument)
median
## # A tibble: 51 x 3
##    State median_ABV median_IBU
##    <fct>      <dbl>      <dbl>
##  1 " AK"     0.056        35  
##  2 " AL"     0.06         39.5
##  3 " AR"     0.052        72  
##  4 " AZ"     0.055        28  
##  5 " CA"     0.058        36  
##  6 " CO"     0.061        35  
##  7 " CT"     0.06         32  
##  8 " DC"     0.0625       75  
##  9 " DE"     0.0705       52  
## 10 " FL"     0.0555       36.5
## # … with 41 more rows
#Draw Bar Charts to compare
#First plot median of alcohol content using modified data
median %>% ggplot()+
  geom_bar(mapping=aes(x=reorder(State,-median_ABV),y=median_ABV,fill=State),stat="identity") +
  ggtitle("Median Alcohol content by State on modified dataset")+xlab("State")+ylab("Alcohol Content Percentage")

#Below is result of using complete data set with missing data to plot median of alcohol content
Beers.with.Breweries %>% group_by(State) %>%
  summarize(median_ABV=median(ABV),median_IBU=median(IBU))%>% ggplot()+
  geom_bar(mapping=aes(x=reorder(State,-median_ABV),y=median_ABV,fill=State),stat="identity")+
  ggtitle("Median Alcohol content by State on non-Modified dataset")+xlab("State")+ylab("Alcohol Content Percentage")
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Removed 18 rows containing missing values (position_stack).

#Below is result of plotting median international bitterness unit for each state on modified data set
median %>% ggplot()+
  geom_bar(mapping=aes(x=reorder(State,-median_IBU),y=median_IBU,fill=State),stat="identity") +
  ggtitle("Median International Bitterness Unit by State on modified dataset")+xlab("State")+ylab("International Bitterness Unit")

#Below is result of using complete data set with missing data to plot median of alcohol content
Beers.with.Breweries %>% group_by(State) %>%
  summarize(median_ABV=median(ABV),median_IBU=median(IBU))%>% ggplot()+
  geom_bar(mapping=aes(x=reorder(State,-median_IBU),y=median_IBU,fill=State),stat="identity") +
  ggtitle("Median International Bitterness Unit by State on non-modified dataset")+xlab("State")+ylab("International Bitterness Unit")
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Removed 47 rows containing missing values (position_stack).

Exploring which state has the maximum alcoholic (ABV) beer and which state has the most bitter (IBU) beer

  • As we can see below, Colorado ‘CO’ has the maximum alcoholic(ABV) beer, it is (0.128)
  • This above result remain unchanged with the original data set
  • As we can see, Oregon and Michigan have the most bitter beer
  • Only Oregon has the most bitter beer in the unchanged data set. It means the Michigan bee bitterness value was imputed
# Discover which state has the maximum alcoholic beer
head(completedData %>%
  arrange(desc(ABV)) %>% 
  select(State,ABV,Beer_Name))
##   State   ABV                                            Beer_Name
## 1    CO 0.128 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale
## 2    KY 0.125                                       London Balling
## 3    IN 0.120                                                 Csar
## 4    CO 0.104     Lee Hill Series Vol. 4 - Manhattan Style Rye Ale
## 5    NY 0.100                                               4Beans
## 6    CA 0.099                                        Lower De Boom
# Discover with un-changed data set
head(Beers.with.Breweries %>%
  arrange(desc(ABV)) %>% 
  select(State,ABV,Beer_Name))
##   State   ABV                                            Beer_Name
## 1    CO 0.128 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale
## 2    KY 0.125                                       London Balling
## 3    IN 0.120                                                 Csar
## 4    CO 0.104     Lee Hill Series Vol. 4 - Manhattan Style Rye Ale
## 5    NY 0.100                                               4Beans
## 6    CA 0.099                                        Lower De Boom
# Discover which state has the most bitter (IBU) beer
head(completedData %>%
  arrange(desc(IBU)) %>% 
  select(State,IBU,Beer_Name))
##   State IBU                 Beer_Name
## 1    OR 138 Bitter Bitch Imperial IPA
## 2    MI 138                    Zaison
## 3    MI 138       Lemon Shandy Tripel
## 4    VA 135        Troopers Alley IPA
## 5    MI 130      Escoffier Bretta Ale
## 6    MA 130             Dead-Eye DIPA
# Discover with un-changed data set
head(Beers.with.Breweries %>%
  arrange(desc(IBU)) %>% 
  select(State,IBU,Beer_Name))
##   State IBU                       Beer_Name
## 1    OR 138       Bitter Bitch Imperial IPA
## 2    VA 135              Troopers Alley IPA
## 3    MA 130                   Dead-Eye DIPA
## 4    OH 126 Bay of Bengal Double IPA (2014)
## 5    MN 120                    Abrasive Ale
## 6    VT 120                    Heady Topper

Comment on the summary statistics and distribution of the ABV variable.

  • For the adjusted data set, we have
  • Minimum ABV value: 0.001 (minimum value of the group, CEO and CFO might be interested)
  • First quartile ABV: 0.05
  • Median of ABV: 0.056 (center among the entire group, CEO and CFO might be interested)
  • Mean of ABV: 0.05975 (Average of the entire group, CEO and CFO might be interested)
  • Third Quartile ABV: 0.067
  • Maximum ABV: 0.128 (maximum value of the group, CEO and CFO might be interested)
  • Comparing the adjusted Data and Unadjusted data, Statistics doesn’t change much at all since it is imputed by Predictive mean matching method
# Summary of adjusted Data
summary(completedData)
##   Beer_Name            Beer_ID            ABV               IBU        
##  Length:2410        Min.   :   1.0   Min.   :0.00100   Min.   :  4.00  
##  Class :character   1st Qu.: 808.2   1st Qu.:0.05000   1st Qu.: 21.00  
##  Mode  :character   Median :1453.5   Median :0.05600   Median : 35.00  
##                     Mean   :1431.1   Mean   :0.05975   Mean   : 41.99  
##                     3rd Qu.:2075.8   3rd Qu.:0.06700   3rd Qu.: 61.00  
##                     Max.   :2692.0   Max.   :0.12800   Max.   :138.00  
##                                                                        
##     Brew_ID         Style               Ounces       Brew_Name        
##  Min.   :  1.0   Length:2410        Min.   : 8.40   Length:2410       
##  1st Qu.: 94.0   Class :character   1st Qu.:12.00   Class :character  
##  Median :206.0   Mode  :character   Median :12.00   Mode  :character  
##  Mean   :232.7                      Mean   :13.59                     
##  3rd Qu.:367.0                      3rd Qu.:16.00                     
##  Max.   :558.0                      Max.   :32.00                     
##                                                                       
##      City               State     
##  Length:2410         CO    : 265  
##  Class :character    CA    : 183  
##  Mode  :character    MI    : 162  
##                      IN    : 139  
##                      TX    : 130  
##                      OR    : 125  
##                     (Other):1406
# Summary of unadjusted Data
summary(Beers.with.Breweries)
##   Beer_Name            Beer_ID            ABV               IBU        
##  Length:2410        Min.   :   1.0   Min.   :0.00100   Min.   :  4.00  
##  Class :character   1st Qu.: 808.2   1st Qu.:0.05000   1st Qu.: 21.00  
##  Mode  :character   Median :1453.5   Median :0.05600   Median : 35.00  
##                     Mean   :1431.1   Mean   :0.05977   Mean   : 42.71  
##                     3rd Qu.:2075.8   3rd Qu.:0.06700   3rd Qu.: 64.00  
##                     Max.   :2692.0   Max.   :0.12800   Max.   :138.00  
##                                      NA's   :62        NA's   :1005    
##     Brew_ID         Style               Ounces       Brew_Name        
##  Min.   :  1.0   Length:2410        Min.   : 8.40   Length:2410       
##  1st Qu.: 94.0   Class :character   1st Qu.:12.00   Class :character  
##  Median :206.0   Mode  :character   Median :12.00   Mode  :character  
##  Mean   :232.7                      Mean   :13.59                     
##  3rd Qu.:367.0                      3rd Qu.:16.00                     
##  Max.   :558.0                      Max.   :32.00                     
##                                                                       
##      City               State     
##  Length:2410         CO    : 265  
##  Class :character    CA    : 183  
##  Mode  :character    MI    : 162  
##                      IN    : 139  
##                      TX    : 130  
##                      OR    : 125  
##                     (Other):1406

Is there an apparent relationship between the bitterness of the beer and its alcoholic content?

  • We can see from the scatter plot, there seem to be some sort of relationship between the two
  • As ABV value increases, IBU seem to also increase
  • This relationship seem to persist when we compare to the unmodified data
#first explore modified data
completedData %>% select(ABV, IBU, State) %>%
  ggplot(aes(x=ABV,y=IBU)) +
  geom_point(position="jitter") +ggtitle("IBU vs. ABV -- modified data")

#next explore unmodified data
Beers.with.Breweries %>% select(ABV, IBU, State) %>%
  ggplot(aes(x=ABV,y=IBU)) +
  geom_point(position="jitter") +ggtitle("IBU vs. ABV -- unmodified data")
## Warning: Removed 1005 rows containing missing values (geom_point).

Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA).

  • There are total of 618 Beer name has Ale in it
  • 41 Beer name has India Pale Ales in it
  • 577 are other types of Ale
  • By using KNN model to try classifying India Pale Ales against other Ales
  • Initially used k=5 (five neighbors to compare): really great Accuracy rate 0.9355, however very low sensitivity rate with the model (correctly predicting India Pale Ale), great Specificity rate (correctly predicting other types of Ales)
  • Next we tried different random seed number and see if this is due to chance: by trying another randomn seed model, got accuracy rate to 0.9597, increased Sensitivity to 0.33, and Super great Specificity: 0.99 (99% chance of predicting other types of Ales)
  • Then we explored what is the best k value to give the most accuracy, after exploring, it is giving k>30, however, this reduced the Sensitivity (successfully predicting India Pale Ales) to almost minimal. It leads us to explore what k gives best Sensitivity rate(try to improve our model sensitivity rate)?
  • After running such test, it suggest a low k number provides better Sensitivity
  • In this scenario, I consider k=3 might be the best compromise to balance the model
  • By using k=3, seed=101, we got Accuracy rate=0.92, while Sensitivity is around 0.33(successfully predicting India Pale Ales), Specificity=0.95(Successfully predict other Ales)
  • Getting an average result of one hundred random generations from KNN model with k=3, Average Accuracy = 0.93, Average Sensitivity = 0.24, Average Specificity = 0.98
  • Conclusion: Even we have a great accuracy rate of predicting with our KNN model using IBU and ABV,we have a poor sensitivity rate to predict India Pale Ales correctly, this maybe due to the fact that the other Ales groups covers majority of the data set, and thus there is a high chance for any Ale to be categorized to other Ale group based on IBU and adjusted ABV alone.
  • Next try Naive Bayes Models to use probabilities to estimate different Ales based on IBU and ABV
  • After trying Naive Bayes Model, it can give an improved Sensitivity rate with the right random generator
  • By getting the average of one hundred random generations from NB model, Average Accuracy = 0.90, Average Sensitivity= 0.15, Average Specificity = 0.95
  • Conclusion with NB model - not as good as KNN with k=3
#In order to investigate the difference respect to IBU and ABV, first extract all name with Ales
#getting all bear name with ale in it (ignore the case factor)
all_ales = completedData %>% filter(str_detect(completedData$Beer_Name,regex("Ale",ignore.case=TRUE)))
india_pale_ales = all_ales %>% 
  filter(str_detect(all_ales$Beer_Name,regex("India Pale Ale",ignore.case=TRUE)))
other_ales = all_ales %>% 
  filter(!str_detect(all_ales$Beer_Name,regex("India Pale Ale",ignore.case=TRUE)))
# in order to effectively use KNN model, I decided to standardize percentage scale of ABV, so the effect of distance from ABV and IBU are roughly the same. I choose to time each ABV value by 1000 to accomplish this standarization
all_ales$ABV = all_ales$ABV * 1000
# Change other ales name to other ales in order to use KNN model to test whether we can use IBU and ABV to distinguish IPAs from others
all_ales = all_ales %>% 
  mutate(India.Pale.Ale.Or.Else = ifelse(str_detect(Beer_Name,regex("India Pale Ale",ignore.case=TRUE)),"India Pale Ale","Other Ale"))
#all_ales$India.Pale.Ale.Or.Else
# Start KNN test to see how good it is to use ABV and IBU to distinguish the Ales
library(class)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
all_ales$India.Pale.Ale.Or.Else = as.factor(all_ales$India.Pale.Ale.Or.Else)
#Keep my result reproducible initially tried set.seed(100), try k=5
set.seed(101)
splitPerc = 0.8
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
# try k=5
classifications = knn(train[,3:4],test[,3:4],train$India.Pale.Ale.Or.Else, prob = TRUE, k = 5)
table(classifications,test$India.Pale.Ale.Or.Else)
##                 
## classifications  India Pale Ale Other Ale
##   India Pale Ale              2         2
##   Other Ale                   4       116
cm = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
cm
## Confusion Matrix and Statistics
## 
##                 
## classifications  India Pale Ale Other Ale
##   India Pale Ale              2         2
##   Other Ale                   4       116
##                                          
##                Accuracy : 0.9516         
##                  95% CI : (0.8977, 0.982)
##     No Information Rate : 0.9516         
##     P-Value [Acc > NIR] : 0.6063         
##                                          
##                   Kappa : 0.3758         
##                                          
##  Mcnemar's Test P-Value : 0.6831         
##                                          
##             Sensitivity : 0.33333        
##             Specificity : 0.98305        
##          Pos Pred Value : 0.50000        
##          Neg Pred Value : 0.96667        
##              Prevalence : 0.04839        
##          Detection Rate : 0.01613        
##    Detection Prevalence : 0.03226        
##       Balanced Accuracy : 0.65819        
##                                          
##        'Positive' Class : India Pale Ale 
## 
# explore best possible K value for accuracy
set.seed(101)
iterations = 500
numks = 50
masterAcc = matrix(nrow = iterations, ncol = numks)
  
for(j in 1:iterations)
{
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
for(i in 1:numks)
{
  classifications = knn(train[,3:4],test[,3:4],train$India.Pale.Ale.Or.Else, prob = TRUE, k = i)
  table(classifications,test$India.Pale.Ale.Or.Else)
  CM = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
  masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
plot(seq(1,numks,1),MeanAcc, type = "l",main="mean Accuracy vs. different K (number of neighbor used to predict)",
     ylab="Mean Accuracy",xlab="k used")

#Try k=30 after finding it might give the most accuracy
#Keep my result reproducible initially tried set.seed(101)
set.seed(101)
splitPerc = 0.8
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
# try k=5
classifications = knn(train[,3:4],test[,3:4],train$India.Pale.Ale.Or.Else, prob = TRUE, k = 30)
table(classifications,test$India.Pale.Ale.Or.Else)
##                 
## classifications  India Pale Ale Other Ale
##   India Pale Ale              0         0
##   Other Ale                   6       118
cm = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
cm
## Confusion Matrix and Statistics
## 
##                 
## classifications  India Pale Ale Other Ale
##   India Pale Ale              0         0
##   Other Ale                   6       118
##                                          
##                Accuracy : 0.9516         
##                  95% CI : (0.8977, 0.982)
##     No Information Rate : 0.9516         
##     P-Value [Acc > NIR] : 0.60634        
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : 0.04123        
##                                          
##             Sensitivity : 0.00000        
##             Specificity : 1.00000        
##          Pos Pred Value :     NaN        
##          Neg Pred Value : 0.95161        
##              Prevalence : 0.04839        
##          Detection Rate : 0.00000        
##    Detection Prevalence : 0.00000        
##       Balanced Accuracy : 0.50000        
##                                          
##        'Positive' Class : India Pale Ale 
## 
# explore best possible K value for sensitivity
set.seed(101)
iterations = 500
numks = 50
masterSen = matrix(nrow = iterations, ncol = numks)
  
for(j in 1:iterations)
{
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
for(i in 1:numks)
{
  classifications = knn(train[,3:4],test[,3:4],train$India.Pale.Ale.Or.Else, prob = TRUE, k = i)
  table(classifications,test$India.Pale.Ale.Or.Else)
  CM = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
  masterSen[j,i] = CM$byClass[1]
}
}
MeanSen = colMeans(masterSen)
plot(seq(1,numks,1),MeanSen, type = "l",main="mean Sensitivity vs. different K (number of neighbor used to predict)",
     ylab="Mean Sensitivity",xlab="k used")

#Try k=3 after finding it might give the most Sensitivity
#Keep my result reproducible initially tried set.seed(101)
set.seed(101)
splitPerc = 0.8
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
# try k=5
classifications = knn(train[,3:4],test[,3:4],train$India.Pale.Ale.Or.Else, prob = TRUE, k = 3)
table(classifications,test$India.Pale.Ale.Or.Else)
##                 
## classifications  India Pale Ale Other Ale
##   India Pale Ale              2         6
##   Other Ale                   4       112
cm = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
cm
## Confusion Matrix and Statistics
## 
##                 
## classifications  India Pale Ale Other Ale
##   India Pale Ale              2         6
##   Other Ale                   4       112
##                                           
##                Accuracy : 0.9194          
##                  95% CI : (0.8567, 0.9606)
##     No Information Rate : 0.9516          
##     P-Value [Acc > NIR] : 0.9614          
##                                           
##                   Kappa : 0.2439          
##                                           
##  Mcnemar's Test P-Value : 0.7518          
##                                           
##             Sensitivity : 0.33333         
##             Specificity : 0.94915         
##          Pos Pred Value : 0.25000         
##          Neg Pred Value : 0.96552         
##              Prevalence : 0.04839         
##          Detection Rate : 0.01613         
##    Detection Prevalence : 0.06452         
##       Balanced Accuracy : 0.64124         
##                                           
##        'Positive' Class : India Pale Ale  
## 
#Get an average percentage of Accuracy, Sensitivity, and Specificity of KNN model k=3
set.seed(101)
iterations = 100
numks = 50
masterAcc = matrix(nrow = iterations, ncol = 1)
masterSen = matrix(nrow = iterations, ncol = 1)
masterSpec = matrix(nrow = iterations, ncol = 1)
for(j in 1:iterations)
{
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
classifications = knn(train[,3:4],test[,3:4],train$India.Pale.Ale.Or.Else, prob = TRUE, k = 3)
CM = confusionMatrix(table(classifications,test$India.Pale.Ale.Or.Else))
masterAcc[j,1]=CM$overall[1]
masterSen[j,1]=CM$byClass[1]
masterSpec[j,1]=CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.9328226
MeanSen
## [1] 0.2558146
MeanSpec
## [1] 0.9819341
# Try using Naive Bayes see if it will improve results, split 0.8:
set.seed(102)
splitPerc = 0.8
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
#NB model
model = naiveBayes(train[,3:4],train$India.Pale.Ale.Or.Else)
CM = confusionMatrix(table(predict(model,test[,3:4]),test$India.Pale.Ale.Or.Else))
CM
## Confusion Matrix and Statistics
## 
##                 
##                  India Pale Ale Other Ale
##   India Pale Ale              4         2
##   Other Ale                   7       111
##                                           
##                Accuracy : 0.9274          
##                  95% CI : (0.8667, 0.9663)
##     No Information Rate : 0.9113          
##     P-Value [Acc > NIR] : 0.3304          
##                                           
##                   Kappa : 0.4352          
##                                           
##  Mcnemar's Test P-Value : 0.1824          
##                                           
##             Sensitivity : 0.36364         
##             Specificity : 0.98230         
##          Pos Pred Value : 0.66667         
##          Neg Pred Value : 0.94068         
##              Prevalence : 0.08871         
##          Detection Rate : 0.03226         
##    Detection Prevalence : 0.04839         
##       Balanced Accuracy : 0.67297         
##                                           
##        'Positive' Class : India Pale Ale  
## 
# Try getting average of Accuracy, Sensitivity and Specificity rate using NB model from 100 random generators
set.seed(101)
splitPerc = .7
iterations = 100
masterAcc = matrix(nrow = iterations, ncol = 1)
masterSen = matrix(nrow = iterations, ncol = 1)
masterSpec = matrix(nrow = iterations, ncol = 1)
for(j in 1:iterations)
{
trainIndices = sample(1:dim(all_ales)[1],round(splitPerc * dim(all_ales)[1]))
train = all_ales[trainIndices,]
test = all_ales[-trainIndices,]
#NB model
model = naiveBayes(train[,3:4],train$India.Pale.Ale.Or.Else)
CM = confusionMatrix(table(predict(model,test[,3:4]),test$India.Pale.Ale.Or.Else))
masterAcc[j,1]=CM$overall[1]
masterSen[j,1]=CM$byClass[1]
masterSpec[j,1]=CM$byClass[2]

}

MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.9011892
MeanSen
## [1] 0.1544105
MeanSpec
## [1] 0.9545286

Further exploration – Free Style

  • Explore Types of Ales produced in each city (Which cities contribute to most varieties of Ales?)
  • Explore average ABV content produced by state (Which states produces higher average ABV content, which states lower?)
#import map library
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Import US Cities location data
uscities <- read.csv("/Users/mingyang/Desktop/SMU/DoingDS_Fall2020/MSDS6306-Case-Study1/uscities.csv",header = TRUE)
uscities = uscities%>%rename(City = city) 
uscities = uscities%>% group_by(City) %>% filter(row_number()==1)
# Getting all Ales Beer available
allAles2 = completedData %>% filter(str_detect(completedData$Beer_Name,regex("Ale",ignore.case=TRUE)))
dataWithMap = left_join(allAles2,uscities, by = "City")
str(dataWithMap)
## 'data.frame':    618 obs. of  26 variables:
##  $ Beer_Name   : chr  "British Pale Ale (2010)" "British Pale Ale" "Klickitat Pale Ale" "Dusty Trail Pale Ale" ...
##  $ Beer_ID     : int  866 48 2402 1474 2592 1818 1326 811 78 76 ...
##  $ ABV         : num  0.054 0.054 0.053 0.054 0.059 0.042 0.058 0.058 0.058 0.055 ...
##  $ IBU         : int  30 30 36 55 24 15 15 15 15 28 ...
##  $ Brew_ID     : int  482 482 118 402 36 172 172 172 172 172 ...
##  $ Style       : chr  "English Pale Ale" "English Pale Ale" "American Pale Ale (APA)" "American Pale Ale (APA)" ...
##  $ Ounces      : num  16 16 12 16 12 12 12 12 12 12 ...
##  $ Brew_Name   : chr  "7 Seas Brewing Company" "7 Seas Brewing Company" "Alameda Brewing" "Amnesia Brewing Company" ...
##  $ City        : chr  "Gig Harbor" "Gig Harbor" "Portland" "Washougal" ...
##  $ State       : Factor w/ 51 levels " AK"," AL"," AR",..: 48 48 38 48 5 5 5 5 5 5 ...
##  $ city_ascii  : chr  "Gig Harbor" "Gig Harbor" "Portland" "Washougal" ...
##  $ state_id    : chr  "WA" "WA" "OR" "WA" ...
##  $ state_name  : chr  "Washington" "Washington" "Oregon" "Washington" ...
##  $ county_fips : int  53053 53053 41051 53011 6075 29053 29053 29053 29053 29053 ...
##  $ county_name : chr  "Pierce" "Pierce" "Multnomah" "Clark" ...
##  $ lat         : num  47.3 47.3 45.5 45.6 37.8 ...
##  $ lng         : num  -123 -123 -123 -122 -122 ...
##  $ population  : num  10717 10717 2074775 16107 3592294 ...
##  $ density     : num  701 701 1894 1073 7256 ...
##  $ source      : chr  "polygon" "polygon" "polygon" "polygon" ...
##  $ military    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ incorporated: logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ timezone    : chr  "America/Los_Angeles" "America/Los_Angeles" "America/Los_Angeles" "America/Los_Angeles" ...
##  $ ranking     : int  3 3 1 3 1 3 3 3 3 3 ...
##  $ zips        : chr  "98332 98335" "98332 98335" "97227 97221 97220 97229 97203 97202 97201 97206 97205 97204 97209 97208 97266 97218 97236 97232 97233 97230 972"| __truncated__ "98671" ...
##  $ id          : int  1840019855 1840019855 1840019941 1840021190 1840021543 1840007420 1840007420 1840007420 1840007420 1840007420 ...
#dataWithMap %>% filter((is.na(lng))|(is.na(lat))) %>% select(Brew_Name,City)
dataWithMap2 = dataWithMap %>% select(Beer_Name,City,lat,lng,state_name)
head(dataWithMap2,100)
##                                          Beer_Name             City     lat
## 1                          British Pale Ale (2010)       Gig Harbor 47.3353
## 2                                 British Pale Ale       Gig Harbor 47.3353
## 3                               Klickitat Pale Ale         Portland 45.5372
## 4                             Dusty Trail Pale Ale        Washougal 45.5822
## 5                                      Liberty Ale    San Francisco 37.7562
## 6                              Keebarlin' Pale Ale        Boonville 38.9588
## 7                                  Boont Amber Ale        Boonville 38.9588
## 8                           Boont Amber Ale (2010)        Boonville 38.9588
## 9                           Boont Amber Ale (2011)        Boonville 38.9588
## 10                    Poleeko Gold Pale Ale (2009)        Boonville 38.9588
## 11                              River Pig Pale Ale          Hayward 37.6328
## 12                                On-On Ale (2008) Colorado Springs 38.8674
## 13                         Greenbelt Farmhouse Ale           Denton 33.2176
## 14             Green Bullet Organic India Pale Ale          Boulder 40.0249
## 15                           Independence Pass Ale            Aspen 39.1950
## 16                         Old Red Beard Amber Ale          Astoria 46.1856
## 17                                Dirty Blonde Ale          Detroit 42.3834
## 18                            Avery India Pale Ale          Boulder 40.0249
## 19                               Ellie’s Brown Ale          Boulder 40.0249
## 20                         Hog Wild India Pale Ale    Fuquay-Varina 35.5958
## 21                            Back East Golden Ale       Bloomfield 36.7401
## 22                                   Back East Ale       Bloomfield 36.7401
## 23                      Truck Stop Honey Brown Ale          Gadsden 34.0090
## 24                              Naked Pig Pale Ale          Gadsden 34.0090
## 25                        Topcutter India Pale Ale           Yakima 46.5923
## 26                               Field 41 Pale Ale           Yakima 46.5923
## 27                          Ballast Point Pale Ale        San Diego 32.8312
## 28                          Big Eye India Pale Ale        San Diego 32.8312
## 29                                 All Nighter Ale     Williamsburg 37.2692
## 30                             Banner American Ale     Williamsburg 37.2692
## 31                              Hayride Autumn Ale         Lewiston 44.0915
## 32                       Celsius Summer Ale (2012)         Lewiston 44.0915
## 33                            Pamola Xtra Pale Ale         Lewiston 44.0915
## 34                                  Watermelon Ale           Lowell 42.6389
## 35                        Fenway American Pale Ale           Lowell 42.6389
## 36                      Bunker Hill Blueberry Ale            Lowell 42.6389
## 37                           Bent Paddle Black Ale           Duluth 46.7756
## 38                       Steel Rail Extra Pale Ale  South Deerfield 42.4795
## 39          West Portal Colorado Common Summer Ale       Broomfield 39.9541
## 40                               413 Farmhouse Ale        Sheffield 34.7570
## 41                             Scape Goat Pale Ale         Missoula 46.8750
## 42                        Montana Trout Slayer Ale         Missoula 46.8750
## 43                           Moose Drool Brown Ale         Missoula 46.8750
## 44                         Powder Hound Winter Ale         Missoula 46.8750
## 45                    Moose Drool Brown Ale (2011)         Missoula 46.8750
## 46                 Montana Trout Slayer Ale (2012)         Missoula 46.8750
## 47                      Scape Goat Pale Ale (2010)         Missoula 46.8750
## 48                 Montana Trout Slayer Ale (2009)         Missoula 46.8750
## 49                    Moose Drool Brown Ale (2009)         Missoula 46.8750
## 50                               Jalapeno Pale Ale        Charlotte 35.2080
## 51                                  Single Hop Ale         Hamilton 39.3938
## 52                                    Sawtooth Ale         Hamilton 39.3938
## 53                              Aftermath Pale Ale         Temecula 33.4928
## 54                          American India Red Ale           Denver 39.7621
## 55                                Colorado Red Ale           Denver 39.7621
## 56                          Saddle Bronc Brown Ale         Sheridan 44.7962
## 57                       Bomber Mountain Amber Ale         Sheridan 44.7962
## 58                               Coconut Brown Ale        Marquette 46.5440
## 59                            Brewerhood Brown Ale          Lincoln 40.8090
## 60                    Last Call Imperial Amber Ale          Lincoln 40.8090
## 61           543 Skull Creek Fresh Hopped Pale Ale          Lincoln 40.8090
## 62                                834 Happy As Ale          Lincoln 40.8090
## 63                            Full Nelson Pale Ale            Afton 44.9042
## 64                     Full Nelson Pale Ale (2010)            Afton 44.9042
## 65                           Blue Point Summer Ale        Patchogue 40.7621
## 66                          Killer Whale Cream Ale     Jacksonville 30.3322
## 67                      Duke's Cold Nose Brown Ale     Jacksonville 30.3322
## 68                      Firestarter India Pale Ale            Eagle 43.7223
## 69                         Kilt Dropper Scotch Ale            Eagle 43.7223
## 70                     Farmer Wirtz India Pale Ale            Eagle 43.7223
## 71                        Slow & Steady Golden Ale            Eagle 43.7223
## 72  Moe's Original Bar B Que 'Bama Brew Golden Ale            Eagle 43.7223
## 73                           Live Local Golden Ale            Eagle 43.7223
## 74                 Screaming Eagle Special Ale ESB            Eagle 43.7223
## 75                                Kindler Pale Ale            Eagle 43.7223
## 76                               Awry Rye Pale Ale            Eagle 43.7223
## 77                              Demshitz Brown Ale            Eagle 43.7223
## 78                      Firestarter India Pale Ale            Eagle 43.7223
## 79                         Samuel Adams Summer Ale           Boston 42.3188
## 80                                 Hoopla Pale Ale          Boulder 40.0249
## 81                         Bozone Select Amber Ale          Bozeman 45.6832
## 82                                SummerBright Ale           Denver 39.7621
## 83                                   Avalanche Ale           Denver 39.7621
## 84                            Rollin Dirty Red Ale            Tampa 27.9942
## 85                                  Le Flaneur Ale     Grand Rapids 42.9620
## 86                Hubris Quadrupel Anniversary Ale     Grand Rapids 42.9620
## 87                            Escoffier Bretta Ale     Grand Rapids 42.9620
## 88                                  Tampa Pale Ale        Tampa Bay      NA
## 89                          Orange Grove Wheat Ale        Tampa Bay      NA
## 90                                 Broad Brook Ale     East Windsor      NA
## 91                       Northern Lights Amber Ale        Anchorage 61.1508
## 92                                  Polar Pale Ale        Anchorage 61.1508
## 93                             Chugach Session Ale        Anchorage 61.1508
## 94                             East India Pale Ale         Brooklyn 40.6501
## 95                             Brooklyn Summer Ale         Brooklyn 40.6501
## 96                             East India Pale Ale         Brooklyn 40.6501
## 97                      Brooklyn Summer Ale (2011)         Brooklyn 40.6501
## 98                     Tule Duck Red Ale (Current)             Reno 39.5497
## 99           Original Orange Blossom Ale (Current)             Reno 39.5497
## 100                                   Pale Alement    Michigan City 41.7099
##           lng     state_name
## 1   -122.5964     Washington
## 2   -122.5964     Washington
## 3   -122.6500         Oregon
## 4   -122.3448     Washington
## 5   -122.4430     California
## 6    -92.7471       Missouri
## 7    -92.7471       Missouri
## 8    -92.7471       Missouri
## 9    -92.7471       Missouri
## 10   -92.7471       Missouri
## 11  -122.0772     California
## 12  -104.7606       Colorado
## 13   -97.1419          Texas
## 14  -105.2523       Colorado
## 15  -106.8369       Colorado
## 16  -123.8053         Oregon
## 17   -83.1024       Michigan
## 18  -105.2523       Colorado
## 19  -105.2523       Colorado
## 20   -78.7794 North Carolina
## 21  -107.9734     New Mexico
## 22  -107.9734     New Mexico
## 23   -86.0156        Alabama
## 24   -86.0156        Alabama
## 25  -120.5496     Washington
## 26  -120.5496     Washington
## 27  -117.1225     California
## 28  -117.1225     California
## 29   -76.7076       Virginia
## 30   -76.7076       Virginia
## 31   -70.1681          Maine
## 32   -70.1681          Maine
## 33   -70.1681          Maine
## 34   -71.3217  Massachusetts
## 35   -71.3217  Massachusetts
## 36   -71.3217  Massachusetts
## 37   -92.1392      Minnesota
## 38   -72.5947  Massachusetts
## 39  -105.0527       Colorado
## 40   -87.6977        Alabama
## 41  -114.0214        Montana
## 42  -114.0214        Montana
## 43  -114.0214        Montana
## 44  -114.0214        Montana
## 45  -114.0214        Montana
## 46  -114.0214        Montana
## 47  -114.0214        Montana
## 48  -114.0214        Montana
## 49  -114.0214        Montana
## 50   -80.8304 North Carolina
## 51   -84.5653           Ohio
## 52   -84.5653           Ohio
## 53  -117.1315     California
## 54  -104.8759       Colorado
## 55  -104.8759       Colorado
## 56  -106.9643        Wyoming
## 57  -106.9643        Wyoming
## 58   -87.4082       Michigan
## 59   -96.6788       Nebraska
## 60   -96.6788       Nebraska
## 61   -96.6788       Nebraska
## 62   -96.6788       Nebraska
## 63   -92.8174      Minnesota
## 64   -92.8174      Minnesota
## 65   -73.0185       New York
## 66   -81.6749        Florida
## 67   -81.6749        Florida
## 68  -116.3862          Idaho
## 69  -116.3862          Idaho
## 70  -116.3862          Idaho
## 71  -116.3862          Idaho
## 72  -116.3862          Idaho
## 73  -116.3862          Idaho
## 74  -116.3862          Idaho
## 75  -116.3862          Idaho
## 76  -116.3862          Idaho
## 77  -116.3862          Idaho
## 78  -116.3862          Idaho
## 79   -71.0846  Massachusetts
## 80  -105.2523       Colorado
## 81  -111.0552        Montana
## 82  -104.8759       Colorado
## 83  -104.8759       Colorado
## 84   -82.4451        Florida
## 85   -85.6562       Michigan
## 86   -85.6562       Michigan
## 87   -85.6562       Michigan
## 88         NA           <NA>
## 89         NA           <NA>
## 90         NA           <NA>
## 91  -149.1091         Alaska
## 92  -149.1091         Alaska
## 93  -149.1091         Alaska
## 94   -73.9496       New York
## 95   -73.9496       New York
## 96   -73.9496       New York
## 97   -73.9496       New York
## 98  -119.8483         Nevada
## 99  -119.8483         Nevada
## 100  -86.8705        Indiana
dataWithMap3 = dataWithMap2 %>% group_by(City) %>% mutate(count = n())
dataWithMap3 = dataWithMap3 %>% group_by(City)%>%filter(row_number()==1)%>%
  filter((!is.na(lng))&(!is.na(lat)))
states <- map_data("state")
p <- ggplot() + 
  geom_polygon(data = states, aes(x = long, y = lat, group = group), fill = "yellow", color = "black") + 
  coord_quickmap()

p <-p + geom_point(data = dataWithMap3, aes(x = lng, y = lat, size=count,alpha=count),color="blue")+
  ggtitle("What cities produce many kinds of Ales") + xlab("Longitude")+ylab("Latitute")
ggplotly(p)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.